# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.1069.3.1+1.1069.9.1 -> 1.1069.3.2 # include/linux/sysctl.h 1.23.1.9 -> 1.30 # include/linux/mmzone.h 1.9.1.2 -> 1.12 # include/linux/mm.h 1.39.1.4 -> 1.47 # mm/page_alloc.c 1.56.1.4 -> 1.59 # include/linux/sched.h 1.33.1.7 -> 1.37.1.3 # kernel/sysctl.c 1.19.1.8 -> 1.23.1.2 # Makefile 1.190.1.51 -> 1.193.1.29 # fs/proc/proc_misc.c 1.20.1.3 -> 1.23.1.2 # include/asm-ppc/pgtable.h 1.20 -> 1.21 # drivers/char/serial.c 1.33.1.3 -> 1.43 # arch/i386/kernel/mpparse.c 1.27.3.1 -> 1.30.1.3 # drivers/pci/pci.ids 1.44.1.2 -> 1.46 # mm/mmap.c 1.25.1.6 -> 1.29.1.6 # kernel/sched.c 1.30.1.4 -> 1.34.1.1 # Documentation/Configure.help 1.162.1.31 -> 1.166.1.10 # drivers/char/hcdp_serial.c 1.3.1.1 -> 1.5 # diff -Nru a/Documentation/Configure.help b/Documentation/Configure.help --- a/Documentation/Configure.help Wed Oct 8 09:06:28 2003 +++ b/Documentation/Configure.help Wed Oct 8 09:06:28 2003 @@ -18210,6 +18210,11 @@ purpose port, say Y here. See . +Support for serial ports defined in ACPI namespace +CONFIG_SERIAL_ACPI + If you wish to enable serial port discovery via the ACPI + namespace, say Y here. If unsure, say N. + Support for PowerMac serial ports CONFIG_MAC_SERIAL If you have Macintosh style serial ports (8 pin mini-DIN), say Y @@ -27000,11 +27005,13 @@ # Choice: ia64type Itanium CONFIG_ITANIUM - Select your IA64 processor type. The default is Intel Itanium. + Select your IA-64 processor type. The default is Intel Itanium. + This choice is safe for all IA-64 systems, but may not perform + optimally on systems with, say, Itanium 2 or newer processors. -McKinley +Itanium 2 CONFIG_MCKINLEY - Select this to configure for a McKinley processor. + Select this to configure for an Itanium 2 (McKinley) processor. 
# Choice: ia64system IA-64 system type @@ -27060,6 +27067,14 @@ CONFIG_IA64_MCA Say Y here to enable machine check support for IA-64. If you're unsure, answer Y. + +Use PAL_HALT_LIGHT in idle loop +CONFIG_IA64_PAL_IDLE + Say Y here to enable use of PAL_HALT_LIGHT in the cpu_idle loop. + This allows the CPU to enter a low power state when idle. You + can enable CONFIG_IA64_PALINFO and check /proc/pal/cpu0/power_info + to see the power consumption and latency for this state. If you're + unsure your firmware supports it, answer N. Disable IA-64 Virtual Hash Page Table CONFIG_DISABLE_VHPT diff -Nru a/Makefile b/Makefile --- a/Makefile Wed Oct 8 09:06:28 2003 +++ b/Makefile Wed Oct 8 09:06:28 2003 @@ -93,6 +93,7 @@ CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 \ -fno-strict-aliasing -fno-common +CFLAGS += -g ifndef CONFIG_FRAME_POINTER CFLAGS += -fomit-frame-pointer endif @@ -305,8 +306,7 @@ $(CONFIG_SHELL) scripts/Configure -d arch/$(ARCH)/config.in xconfig: symlinks - $(MAKE) -C scripts kconfig.tk - wish -f scripts/kconfig.tk + @echo -e "***\n* Sorry, xconfig is broken; use \"make menuconfig\" instead.\n***" menuconfig: include/linux/version.h symlinks $(MAKE) -C scripts/lxdialog all diff -Nru a/drivers/char/serial.c b/drivers/char/serial.c --- a/drivers/char/serial.c Wed Oct 8 09:06:28 2003 +++ b/drivers/char/serial.c Wed Oct 8 09:06:28 2003 @@ -92,9 +92,8 @@ * ever possible. * * CONFIG_SERIAL_ACPI - * Enable support for serial console port and serial - * debug port as defined by the SPCR and DBGP tables in - * ACPI 2.0. + * Enable support for serial ports found in the ACPI + * namespace. 
*/ #include @@ -222,6 +221,10 @@ #ifdef CONFIG_MAGIC_SYSRQ #include #endif +#ifdef ENABLE_SERIAL_ACPI +#include +#include +#endif /* * All of the compatibilty code so we can compile serial.c against @@ -257,6 +260,10 @@ static struct timer_list serial_timer; +#define HP_DIVA_CHECKTIME (1*HZ) +static struct timer_list hp_diva_timer; +static int hp_diva_count = 0; + /* serial subtype definitions */ #ifndef SERIAL_TYPE_NORMAL #define SERIAL_TYPE_NORMAL 1 @@ -793,6 +800,41 @@ } #ifdef CONFIG_SERIAL_SHARE_IRQ +static inline int is_hp_diva_info(struct async_struct *info) +{ + struct pci_dev *dev = info->state->dev; + return (dev && dev->vendor == PCI_VENDOR_ID_HP && + dev->device == PCI_DEVICE_ID_HP_SAS); +} + +static inline int is_hp_diva_irq(int irq) +{ + struct async_struct *info = IRQ_ports[irq]; + return (info && is_hp_diva_info(info)); +} + +/* + * It is possible to "use up" transmit empty interrupts in some + * cases with HP Diva cards. Figure out if there _should_ be a + * transmit interrupt and if so, return a suitable iir value so + * that we can recover when called from rs_timer(). 
+ */ +static inline int hp_diva_iir(int irq, struct async_struct *info) +{ + int iir = serial_in(info, UART_IIR); + + if (is_hp_diva_info(info) && + (iir & UART_IIR_NO_INT) != 0 && + (info->IER & UART_IER_THRI) != 0 && + (info->xmit.head != info->xmit.tail || info->x_char) && + (serial_in(info, UART_LSR) & UART_LSR_THRE) != 0) { + iir &= ~(UART_IIR_ID | UART_IIR_NO_INT); + iir |= UART_IIR_THRI; + } + + return iir; +} + /* * This is the serial driver's generic interrupt routine */ @@ -823,7 +865,7 @@ do { if (!info->tty || - ((iir=serial_in(info, UART_IIR)) & UART_IIR_NO_INT)) { + ((iir=hp_diva_iir(irq, info)) & UART_IIR_NO_INT)) { if (!end_mark) end_mark = info; goto next; @@ -1092,9 +1134,11 @@ #ifdef CONFIG_SERIAL_SHARE_IRQ if (info->next_port) { do { - serial_out(info, UART_IER, 0); - info->IER |= UART_IER_THRI; - serial_out(info, UART_IER, info->IER); + if (!is_hp_diva_info(info)) { + serial_out(info, UART_IER, 0); + info->IER |= UART_IER_THRI; + serial_out(info, UART_IER, info->IER); + } info = info->next_port; } while (info); #ifdef CONFIG_SERIAL_MULTIPORT @@ -1126,6 +1170,35 @@ } /* + * This subroutine is called when the hp_diva_timer goes off. In + * certain cases (multiple gettys in particular) Diva seems to issue + * only a single transmit empty interrupt instead of one each time + * THRI is enabled, causing interrupts to be "used up". This serves + * to poll the Diva UARTS more frequently than rs_timer() does. 
+ */ +static void hp_diva_check(unsigned long dummy) +{ +#ifdef CONFIG_SERIAL_SHARE_IRQ + static unsigned long last_strobe; + unsigned long flags; + int i; + + if (time_after_eq(jiffies, last_strobe + HP_DIVA_CHECKTIME)) { + for (i = 0; i < NR_IRQS; i++) { + if (is_hp_diva_irq(i)) { + save_flags(flags); cli(); + rs_interrupt(i, NULL, NULL); + restore_flags(flags); + } + } + } + last_strobe = jiffies; + mod_timer(&hp_diva_timer, jiffies + HP_DIVA_CHECKTIME); +#endif +} + + +/* * --------------------------------------------------------------- * Low level utility subroutines for the serial driver: routines to * figure out the appropriate timeout for an interrupt chain, routines @@ -4281,6 +4354,12 @@ break; } + if (hp_diva_count++ == 0) { + init_timer(&hp_diva_timer); + hp_diva_timer.function = hp_diva_check; + mod_timer(&hp_diva_timer, jiffies + HP_DIVA_CHECKTIME); + } + return 0; } @@ -4584,6 +4663,129 @@ } } +#ifdef ENABLE_SERIAL_ACPI +static acpi_status acpi_serial_address(struct serial_struct *req, + struct acpi_resource_address64 *addr) +{ + unsigned long size; + + size = addr->max_address_range - addr->min_address_range + 1; + req->iomem_base = ioremap(addr->min_address_range, size); + if (!req->iomem_base) { + printk("%s: couldn't ioremap 0x%lx-0x%lx\n", __FUNCTION__, + addr->min_address_range, addr->max_address_range); + return AE_ERROR; + } + req->io_type = SERIAL_IO_MEM; + return AE_OK; +} + +static acpi_status acpi_serial_ext_irq(struct serial_struct *req, + struct acpi_resource_ext_irq *ext_irq) +{ + if (ext_irq->number_of_interrupts > 0) { +#ifdef CONFIG_IA64 + req->irq = acpi_register_irq(ext_irq->interrupts[0], + ext_irq->active_high_low, ext_irq->edge_level); +#else + req->irq = ext_irq->interrupts[0]; +#endif + } + return AE_OK; +} + +static acpi_status acpi_serial_port(struct serial_struct *req, + struct acpi_resource_io *io) +{ + req->port = io->min_base_address; + req->io_type = SERIAL_IO_PORT; + return AE_OK; +} + +static acpi_status 
acpi_serial_irq(struct serial_struct *req, + struct acpi_resource_irq *irq) +{ + if (irq->number_of_interrupts > 0) { +#ifdef CONFIG_IA64 + req->irq = acpi_register_irq(irq->interrupts[0], + irq->active_high_low, irq->edge_level); +#else + req->irq = irq->interrupts[0]; +#endif + } + return AE_OK; +} + +static acpi_status acpi_serial_resource(struct acpi_resource *res, void *data) +{ + struct serial_struct *serial_req = (struct serial_struct *) data; + struct acpi_resource_address64 addr; + acpi_status status; + + status = acpi_resource_to_address64(res, &addr); + if (ACPI_SUCCESS(status)) + return acpi_serial_address(serial_req, &addr); + else if (res->id == ACPI_RSTYPE_EXT_IRQ) + return acpi_serial_ext_irq(serial_req, &res->data.extended_irq); + else if (res->id == ACPI_RSTYPE_IO) + return acpi_serial_port(serial_req, &res->data.io); + else if (res->id == ACPI_RSTYPE_IRQ) + return acpi_serial_irq(serial_req, &res->data.irq); + return AE_OK; +} + +static int acpi_serial_add(struct acpi_device *device) +{ + acpi_status status; + struct serial_struct serial_req; + int line; + + memset(&serial_req, 0, sizeof(serial_req)); + + status = acpi_walk_resources(device->handle, METHOD_NAME__CRS, + acpi_serial_resource, &serial_req); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!serial_req.iomem_base && !serial_req.port) { + printk("%s: no iomem or port address in %s _CRS\n", __FUNCTION__, + device->pnp.bus_id); + return -ENODEV; + } + + serial_req.baud_base = BASE_BAUD; + serial_req.flags = ASYNC_SKIP_TEST|ASYNC_BOOT_AUTOCONF|ASYNC_AUTO_IRQ; + serial_req.xmit_fifo_size = serial_req.custom_divisor = 0; + serial_req.close_delay = serial_req.hub6 = serial_req.closing_wait = 0; + serial_req.iomem_reg_shift = 0; + + line = register_serial(&serial_req); + if (line < 0) + return -ENODEV; + + return 0; +} + +static int acpi_serial_remove(struct acpi_device *device, int type) +{ + return 0; +} + +static struct acpi_driver acpi_serial_driver = { + .name = "serial", + .class = 
"", + .ids = "PNP0501", + .ops = { + .add = acpi_serial_add, + .remove = acpi_serial_remove, + }, +}; + +static void __devinit probe_serial_acpi(void) +{ + acpi_bus_register_driver(&acpi_serial_driver); +} +#endif /* ENABLE_SERIAL_ACPI */ static struct pci_device_id serial_pci_tbl[] __devinitdata = { { PCI_VENDOR_ID_V3, PCI_DEVICE_ID_V3_V960, @@ -5545,6 +5747,9 @@ tty_register_devfs(&callout_driver, 0, callout_driver.minor_start + state->line); } +#ifdef ENABLE_SERIAL_ACPI + probe_serial_acpi(); +#endif #ifdef ENABLE_SERIAL_PCI probe_serial_pci(); #endif @@ -5722,6 +5927,8 @@ /* printk("Unloading %s: version %s\n", serial_name, serial_version); */ del_timer_sync(&serial_timer); + if (hp_diva_count > 0) + del_timer_sync(&hp_diva_timer); save_flags(flags); cli(); remove_bh(SERIAL_BH); if ((e1 = tty_unregister_driver(&serial_driver))) diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c --- a/fs/proc/proc_misc.c Wed Oct 8 09:06:28 2003 +++ b/fs/proc/proc_misc.c Wed Oct 8 09:06:28 2003 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -209,6 +210,8 @@ K(i.freeram-i.freehigh), K(i.totalswap), K(i.freeswap)); + + len += hugetlb_report_meminfo(page + len); return proc_calc_metrics(page, start, off, count, eof, len); #undef B diff -Nru a/include/asm-ppc/pgtable.h b/include/asm-ppc/pgtable.h --- a/include/asm-ppc/pgtable.h Wed Oct 8 09:06:28 2003 +++ b/include/asm-ppc/pgtable.h Wed Oct 8 09:06:28 2003 @@ -109,6 +109,13 @@ extern unsigned long va_to_phys(unsigned long address); extern pte_t *va_to_pte(unsigned long address); extern unsigned long ioremap_bot, ioremap_base; +extern unsigned long vmalloc_start; + +/* Start and end of the vmalloc area. 
*/ +#define VMALLOC_START vmalloc_start +#define VMALLOC_END ioremap_bot +#define VMALLOC_VMADDR(x) ((unsigned long)(x)) + #endif /* __ASSEMBLY__ */ /* @@ -194,32 +201,6 @@ printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) - -/* - * Just any arbitrary offset to the start of the vmalloc VM area: the - * current 64MB value just means that there will be a 64MB "hole" after the - * physical memory until the kernel virtual memory starts. That means that - * any out-of-bounds memory accesses will hopefully be caught. - * The vmalloc() routines leaves a hole of 4kB between each vmalloced - * area for the same reason. ;) - * - * We no longer map larger than phys RAM with the BATs so we don't have - * to worry about the VMALLOC_OFFSET causing problems. We do have to worry - * about clashes between our early calls to ioremap() that start growing down - * from ioremap_base being run into the VM area allocations (growing upwards - * from VMALLOC_START). For this reason we have ioremap_bot to check when - * we actually run into our mappings setup in the early boot with the VM - * system. This really does become a problem for machines with good amounts - * of RAM. -- Cort - */ -#define VMALLOC_OFFSET (0x1000000) /* 16M */ -#ifdef CONFIG_44x -#define VMALLOC_START (((_ALIGN((long)high_memory, PPC44x_PIN_SIZE) + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) -#else -#define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) -#endif -#define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#define VMALLOC_END ioremap_bot /* * Bits in a linux-style PTE. 
These match the bits in the diff -Nru a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h Wed Oct 8 09:06:28 2003 +++ b/include/linux/mm.h Wed Oct 8 09:06:28 2003 @@ -103,6 +103,9 @@ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_WRITECOMBINED 0x00100000 /* Write-combined */ +#define VM_NONCACHED 0x00200000 /* Noncached access */ +#define VM_HUGETLB 0x00400000 /* Huge tlb Page*/ #ifndef VM_STACK_FLAGS #define VM_STACK_FLAGS 0x00000177 diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h --- a/include/linux/mmzone.h Wed Oct 8 09:06:28 2003 +++ b/include/linux/mmzone.h Wed Oct 8 09:06:28 2003 @@ -8,6 +8,12 @@ #include #include #include +#ifdef CONFIG_DISCONTIGMEM +#include +#endif +#ifndef MAX_NUMNODES +#define MAX_NUMNODES 1 +#endif /* * Free memory management - zoned buddy allocator. @@ -118,7 +124,8 @@ * rarely used fields: */ char *name; - unsigned long size; + unsigned long totalsize; + unsigned long memsize; unsigned long realsize; } zone_t; @@ -134,7 +141,7 @@ * footprint of this construct is very small. */ typedef struct zonelist_struct { - zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited + zone_t * zones [MAX_NUMNODES*MAX_NR_ZONES+1]; // NULL delimited } zonelist_t; #define GFP_ZONEMASK 0x0f @@ -236,6 +243,18 @@ #define for_each_zone(zone) \ for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) +#ifdef CONFIG_NUMA +#define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */ +#include +#else /* !CONFIG_NUMA */ +#define MAX_NR_MEMBLKS 1 +#endif /* CONFIG_NUMA */ + +/* Returns the number of the current Node. 
*/ + +#ifndef CONFIG_NUMA +#define numa_node_id() (__cpu_to_node(smp_processor_id())) +#endif #ifndef CONFIG_DISCONTIGMEM diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h --- a/include/linux/sysctl.h Wed Oct 8 09:06:28 2003 +++ b/include/linux/sysctl.h Wed Oct 8 09:06:28 2003 @@ -154,6 +154,7 @@ VM_GFP_DEBUG=18, /* debug GFP failures */ VM_CACHE_SCAN_RATIO=19, /* part of the inactive cache list to scan */ VM_MAPPED_RATIO=20, /* amount of unfreeable pages that triggers swapout */ + VM_HUGETLB_PAGES=21, /* int: Number of available Huge Pages */ }; diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c --- a/kernel/sysctl.c Wed Oct 8 09:06:28 2003 +++ b/kernel/sysctl.c Wed Oct 8 09:06:28 2003 @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -306,6 +307,10 @@ &vm_max_readahead,sizeof(int), 0644, NULL, &proc_dointvec}, {VM_MAX_MAP_COUNT, "max_map_count", &max_map_count, sizeof(int), 0644, NULL, &proc_dointvec}, +#ifdef CONFIG_HUGETLB_PAGE + {VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL, + &hugetlb_sysctl_handler}, +#endif {0} }; diff -Nru a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c Wed Oct 8 09:06:28 2003 +++ b/mm/mmap.c Wed Oct 8 09:06:28 2003 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -594,7 +595,10 @@ fput(file); /* Undo any partial mapping done by a device driver. 
*/ - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + if (is_vm_hugetlb_page(vma)) + zap_hugepage_range(vma, vma->vm_start, vma->vm_end-vma->vm_start); + else + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -644,10 +648,26 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (flags & MAP_FIXED) { + unsigned long ret; + if (addr > TASK_SIZE - len) return -ENOMEM; if (addr & ~PAGE_MASK) return -EINVAL; + if (file && is_file_hugepages(file)) + /* If the request is for hugepages, then make sure + * that addr and length are properly aligned. + */ + ret = is_aligned_hugepage_range(addr, len); + else + /* + * Make sure that a normal request is not falling + * in reserved hugepage range. For some archs like + * IA-64, there is a separate region for hugepages. + */ + ret = is_invalid_hugepage_range(addr, len); + if (ret) + return ret; return addr; } @@ -941,6 +961,12 @@ return 0; /* we have addr < mpnt->vm_end */ + if (is_vm_hugetlb_page(mpnt)) { + int ret = is_aligned_hugepage_range(addr, len); + if (ret) + return ret; + } + if (mpnt->vm_start >= addr+len) return 0; @@ -994,7 +1020,10 @@ remove_shared_vm_struct(mpnt); mm->map_count--; - zap_page_range(mm, st, size); + if (is_vm_hugetlb_page(mpnt)) + zap_hugepage_range(mpnt, st, size); + else + zap_page_range(mm, st, size); /* * Fix the mapping, and free the old area if it wasn't reused. 
@@ -1151,7 +1180,10 @@ } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + if (is_vm_hugetlb_page(mpnt)) + zap_hugepage_range(mpnt, start, size); + else + zap_page_range(mm, start, size); if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c Wed Oct 8 09:06:28 2003 +++ b/mm/page_alloc.c Wed Oct 8 09:06:28 2003 @@ -49,11 +49,11 @@ /* * Temporary debugging check. */ -#define BAD_RANGE(zone, page) \ -( \ - (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ - || (((page) - mem_map) < (zone)->zone_start_mapnr) \ - || ((zone) != page_zone(page)) \ +#define BAD_RANGE(zone, page) \ +( \ + (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->totalsize)) \ + || (((page) - mem_map) < (zone)->zone_start_mapnr) \ + || ((zone) != page_zone(page)) \ ) /* @@ -577,7 +577,7 @@ unsigned long nr, total, flags; total = 0; - if (zone->size) { + if (zone->memsize) { spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { head = &(zone->free_area + order)->free_list; @@ -609,13 +609,44 @@ /* * Builds allocation fallback zone lists. 
*/ -static inline void build_zonelists(pg_data_t *pgdat) +static int __init build_zonelists_node(pg_data_t *pgdat, zonelist_t *zonelist, int j, int k) { - int i, j, k; + zone_t *zone; + switch (k) { + default: + BUG(); + /* + * fallthrough: + */ + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->memsize) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->memsize) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->memsize) + zonelist->zones[j++] = zone; + } + + return j; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + local_node = pgdat->node_id; + printk("Building zonelist for node : %d\n", local_node); for (i = 0; i <= GFP_ZONEMASK; i++) { zonelist_t *zonelist; - zone_t *zone; zonelist = pgdat->node_zonelists + i; memset(zonelist, 0, sizeof(*zonelist)); @@ -627,33 +658,32 @@ if (i & __GFP_DMA) k = ZONE_DMA; - switch (k) { - default: - BUG(); - /* - * fallthrough: - */ - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->size) { -#ifndef CONFIG_HIGHMEM - BUG(); -#endif - zonelist->zones[j++] = zone; - } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->size) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->size) - zonelist->zones[j++] = zone; - } + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. 
+ * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < numnodes; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + for (node = 0; node < local_node; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j++] = NULL; } } +void __init build_all_zonelists(void) +{ + int i; + + for(i = 0 ; i < numnodes ; i++) + build_zonelists(NODE_DATA(i)); +} + /* * Helper functions to size the waitqueue hash table. * Essentially these want to choose hash table sizes sufficiently @@ -696,6 +726,31 @@ return ffz(~size); } +static unsigned long memmap_init(struct page *start, struct page *end, + int zone, unsigned long start_paddr, int highmem) +{ + struct page *page; + + for (page = start; page < end; page++) { + set_page_zone(page, zone); + set_page_count(page, 0); + SetPageReserved(page); + INIT_LIST_HEAD(&page->list); + if (!highmem) + set_page_address(page, __va(start_paddr)); + start_paddr += PAGE_SIZE; + } + return start_paddr; +} + +#ifdef HAVE_ARCH_MEMMAP_INIT +#define MEMMAP_INIT(start, end, zone, paddr, highmem) \ + arch_memmap_init(memmap_init, start, end, zone, paddr, highmem) +#else +#define MEMMAP_INIT(start, end, zone, paddr, highmem) \ + memmap_init(start, end, zone, paddr, highmem) +#endif + #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) /* @@ -717,10 +772,8 @@ BUG(); totalpages = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - unsigned long size = zones_size[i]; - totalpages += size; - } + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; realtotalpages = totalpages; if (zholes_size) for (i = 0; i < MAX_NR_ZONES; i++) @@ -729,7 +782,7 @@ printk("On node %d totalpages: %lu\n", nid, realtotalpages); /* - * Some architectures (with lots of mem and discontinous memory + * Some architectures (with lots of 
mem and discontiguous memory * maps) have to search for a good mem_map area: * For discontigmem, the conceptual mem map array starts from * PAGE_OFFSET, we need to align the actual array onto a mem map @@ -742,7 +795,7 @@ MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); } *gmap = pgdat->node_mem_map = lmem_map; - pgdat->node_size = totalpages; + pgdat->node_size = 0; pgdat->node_start_paddr = zone_start_paddr; pgdat->node_start_mapnr = (lmem_map - mem_map); pgdat->nr_zones = 0; @@ -759,8 +812,9 @@ if (zholes_size) realsize -= zholes_size[j]; - printk("zone(%lu): %lu pages.\n", j, size); - zone->size = size; + printk("zone(%lu): %lu pages.\n", j, realsize); + zone->totalsize = size; + zone->memsize = realsize; zone->realsize = realsize; zone->name = zone_names[j]; zone->lock = SPIN_LOCK_UNLOCKED; @@ -770,6 +824,7 @@ zone->nr_active_pages = zone->nr_inactive_pages = 0; + pgdat->node_size += realsize; if (!size) continue; @@ -830,16 +885,10 @@ * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ - for (i = 0; i < size; i++) { - struct page *page = mem_map + offset + i; - set_page_zone(page, nid * MAX_NR_ZONES + j); - set_page_count(page, 0); - SetPageReserved(page); - INIT_LIST_HEAD(&page->list); - if (j != ZONE_HIGHMEM) - set_page_address(page, __va(zone_start_paddr)); - zone_start_paddr += PAGE_SIZE; - } + zone_start_paddr = MEMMAP_INIT(mem_map + offset, + mem_map + offset + size, + nid * MAX_NR_ZONES + j, zone_start_paddr, + (j == ZONE_HIGHMEM ? 1 : 0)); offset += size; for (i = 0; ; i++) { @@ -880,7 +929,6 @@ (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); } } - build_zonelists(pgdat); } void __init free_area_init(unsigned long *zones_size)